Case-to-Code: A DNAJC6 Analysis

Author

Raval Kush

CASE TO CODE : INFANTILE PARKINSONISM

**Visualizing the structural defects in DNAJC6 after altered genotype **

Code
# Structural & Clinical Context

# --- 1. Load Libraries ---
library(r3dmol)
library(htmltools)
library(knitr)

# --- 2. Create the Corrected Mutation Table ---
# (This section remains unchanged)
patient_genotypes <- data.frame(
  Sample_Group = c("Control", "CRISPR_P1", "Patient1", "Patient2", "Patient3"),
  Genotype = c(
    "Wild-type",
    "CRISPR-corrected (from Patient 1)",
    "c.766C>T (homozygous)",
    "c.2416C>T (homozygous)",
    "c.801-2 A>G (homozygous)"
  ),
  Mutation_Type = c(
    "Normal",
    "Corrected to WT",
    "Nonsense (Exon 6)",
    "Nonsense (Exon 16)",
    "Splice acceptor (Exon 7)"
  ),
  Predicted_Protein = c(
    "Full-length DNAJC6 (910 aa)",
    "Full-length DNAJC6 (910 aa)",
    "p.R256* - Truncated at residue 256",
    "p.R806* - Truncated at residue 806",
    "Aberrant splicing → likely NMD or exon 7 skip"
  ),
  Functional_Consequence = c(
    "Normal auxilin function",
    "Restored auxilin function",
    "Loss of 72% of protein (clathrin-binding LOST)",
    "Loss of 11% of protein (clathrin-binding LOST)",
    "Likely complete loss of function"
  )
)

knitr::kable(patient_genotypes, 
             caption = "DNAJC6 Genotypes Across Patient Samples",
             align = "l")
DNAJC6 Genotypes Across Patient Samples
Sample_Group Genotype Mutation_Type Predicted_Protein Functional_Consequence
Control Wild-type Normal Full-length DNAJC6 (910 aa) Normal auxilin function
CRISPR_P1 CRISPR-corrected (from Patient 1) Corrected to WT Full-length DNAJC6 (910 aa) Restored auxilin function
Patient1 c.766C>T (homozygous) Nonsense (Exon 6) p.R256* - Truncated at residue 256 Loss of 72% of protein (clathrin-binding LOST)
Patient2 c.2416C>T (homozygous) Nonsense (Exon 16) p.R806* - Truncated at residue 806 Loss of 11% of protein (clathrin-binding LOST)
Patient3 c.801-2 A>G (homozygous) Splice acceptor (Exon 7) Aberrant splicing → likely NMD or exon 7 skip Likely complete loss of function
Code
# --- 3. Define Key Features & Domains ---
protein_length <- 913   

# Domain coordinates (User-specified)
phosphatase_tensin_domain_start <- 55
phosphatase_tensin_domain_end <- 222
C2_tensin_domain_start <- 228
C2_tensin_domain_end <- 366
j_domain_start <- 849
j_domain_end <- 913

# Mutation locations
mut_p1_residue <- 256 # p.R256*
mut_p2_residue <- 806 # p.R806*

# Exon 7 boundaries (approximate)
exon7_start <- 267   
exon7_end <- 317   


# --- 4. Read PDB Data from Local File ---
pdb_file_path <- "AF-O75061-F1-model_v6.pdb"
pdb_data <- paste(readLines(pdb_file_path), collapse = "\n")
Warning in readLines(pdb_file_path): incomplete final line found on
'AF-O75061-F1-model_v6.pdb'
Code
# --- 5. Create Interactive 3D Visualization ---
viewer <- r3dmol(
  width = "100%", 
  height = "600px",
  viewer_spec = m_viewer_spec(
    backgroundColor = "white"
  )
) %>%
  
  # Load the AlphaFold structure
  m_add_model(data = pdb_data, format = "pdb") %>%
  
  # Base style: Color by AlphaFold confidence 
  m_set_style(style = m_style_cartoon(color = "spectrum")) %>%
  
  # --- Highlight Key Functional Domains ---
  
  # Phosphatase Tensin-like Domain (Orange)
  m_set_style(
    sel = list(resi = phosphatase_tensin_domain_start:phosphatase_tensin_domain_end), 
    style = m_style_cartoon(color = "#F39B7F", thickness = 1.2)
  ) %>%
  m_add_label(
    "Phosphatase Tensin-like",
    sel = list(resi = 138),
    style = m_style_label(
      backgroundColor = "#F39B7F", 
      fontColor = "black",
      fontSize = 12
    )
  ) %>%
  
  # C2 Tensin-like Domain (Red)
  m_set_style(
    sel = list(resi = C2_tensin_domain_start:C2_tensin_domain_end), 
    style = m_style_cartoon(color = "#E64B35", thickness = 1.2)
  ) %>%
  m_add_label(
    "C2 Tensin-like Domain",
    sel = list(resi = 297),
    style = m_style_label(
      backgroundColor = "#E64B35", 
      fontColor = "white",
      fontSize = 12
    )
  ) %>%
  
  # J-Domain (Green) - HSC70 interaction
  m_set_style(
    sel = list(resi = j_domain_start:j_domain_end), 
    style = m_style_cartoon(color = "#00A087", thickness = 1.2)
  ) %>%
  m_add_label(
    "J-Domain (HSC70 binding)",
    sel = list(resi = 881),
    style = m_style_label(
      backgroundColor = "#00A087", 
      fontColor = "white",
      fontSize = 12
    )
  ) %>%
  
  # --- Clathrin-Binding Domain Block REMOVED ---
  
  # --- Highlight Patient Mutations ---
  
  # Patient 1: p.R256* (Early truncation - black sphere)
  m_add_style(
    sel = list(resi = mut_p1_residue), 
    style = m_style_sphere(color = "black", radius = 2.0)
  ) %>%
  m_add_label(
    "P1: p.R256* TRUNCATION",
    sel = list(resi = mut_p1_residue), 
    style = m_style_label(
      backgroundColor = "black", 
      fontColor = "white",
      fontSize = 11
    )
  ) %>%
  
  # Show what Patient 1 LOSES (gray out everything after 256)
  m_set_style(
    sel = list(resi = (mut_p1_residue + 1):protein_length), 
    style = m_style_cartoon(color = "lightgray", opacity = 0.3)
  ) %>%
  
  # Patient 3: Exon 7 region (Dark Gray - splice site mutation)
  m_set_style(
    sel = list(resi = exon7_start:exon7_end), 
    style = m_style_cartoon(color = "#4D4D4D", thickness = 1.3)
  ) %>%
  m_add_label(
    "P3: Exon 7 (Splice defect)",
    sel = list(resi = 292), 
    style = m_style_label(
      backgroundColor = "#4D4D4D",
      fontColor = "white",
      fontSize = 11
    )
  ) %>%

  # Patient 2: p.R806* (Late truncation - dark red sphere)
  # This is just BEFORE the J-domain
  m_add_style(
    sel = list(resi = mut_p2_residue), 
    style = m_style_sphere(color = "darkred", radius = 2.0)
  ) %>%
  m_add_label(
    "P2: p.R806* TRUNCATION",
    sel = list(resi = mut_p2_residue), 
    style = m_style_label(
      backgroundColor = "darkred", 
      fontColor = "white",
      fontSize = 11
    )
  ) %>%
  
  # Show what Patient 2 LOSES (the J-domain)
  m_set_style(
    sel = list(resi = (mut_p2_residue + 1):protein_length), 
    style = m_style_cartoon(color = "pink", opacity = 0.4)
  ) %>%

  # Zoom and enable spinning for presentation
  m_zoom_to() %>%
  m_spin(speed = 0.5)

# Display the viewer
viewer

**RStudio Code for Analysis**

PHASE 1 : Initial Requirements

1.1 : Setting seed and Loading required packages

Code
# 1.1 : Setting seed and Loading required packages

set.seed(123) # Or any integer you like


#install if you havent already
# install.packges()

# For data manipulation (dplyr, tidyr) and plotting (ggplot2)
library(tidyverse) 
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.1     ✔ stringr   1.5.2
✔ ggplot2   4.0.0     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Code
#install if you havent already
# if (!require("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")

# For differential expression analysis
library(DESeq2)
Loading required package: S4Vectors
Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: generics

Attaching package: 'generics'

The following object is masked from 'package:lubridate':

    as.difftime

The following object is masked from 'package:dplyr':

    explain

The following objects are masked from 'package:base':

    as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
    setequal, union


Attaching package: 'BiocGenerics'

The following object is masked from 'package:dplyr':

    combine

The following objects are masked from 'package:stats':

    IQR, mad, sd, var, xtabs

The following objects are masked from 'package:base':

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, is.unsorted, lapply, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rownames, sapply, saveRDS, table, tapply, unique,
    unsplit, which.max, which.min


Attaching package: 'S4Vectors'

The following objects are masked from 'package:lubridate':

    second, second<-

The following objects are masked from 'package:dplyr':

    first, rename

The following object is masked from 'package:tidyr':

    expand

The following object is masked from 'package:utils':

    findMatches

The following objects are masked from 'package:base':

    expand.grid, I, unname

Loading required package: IRanges

Attaching package: 'IRanges'

The following object is masked from 'package:lubridate':

    %within%

The following objects are masked from 'package:dplyr':

    collapse, desc, slice

The following object is masked from 'package:purrr':

    reduce

The following object is masked from 'package:grDevices':

    windows

Loading required package: GenomicRanges
Loading required package: GenomeInfoDb
Loading required package: SummarizedExperiment
Loading required package: MatrixGenerics
Loading required package: matrixStats

Attaching package: 'matrixStats'

The following object is masked from 'package:dplyr':

    count


Attaching package: 'MatrixGenerics'

The following objects are masked from 'package:matrixStats':

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
    rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
    rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
    rowWeightedSds, rowWeightedVars

Loading required package: Biobase
Welcome to Bioconductor

    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.


Attaching package: 'Biobase'

The following object is masked from 'package:MatrixGenerics':

    rowMedians

The following objects are masked from 'package:matrixStats':

    anyMissing, rowMedians
Code
library(edgeR)
Loading required package: limma

Attaching package: 'limma'

The following object is masked from 'package:DESeq2':

    plotMA

The following object is masked from 'package:BiocGenerics':

    plotMA
Code
# For our heatmap
library(pheatmap)

# For GO, KEGG, and GSEA enrichment analysis
library(clusterProfiler) 

clusterProfiler v4.16.0 Learn more at https://yulab-smu.top/contribution-knowledge-mining/

Please cite:

T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan,
X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal
enrichment tool for interpreting omics data. The Innovation. 2021,
2(3):100141

Attaching package: 'clusterProfiler'

The following object is masked from 'package:IRanges':

    slice

The following object is masked from 'package:S4Vectors':

    rename

The following object is masked from 'package:purrr':

    simplify

The following object is masked from 'package:stats':

    filter
Code
library(enrichplot)
enrichplot v1.28.4 Learn more at https://yulab-smu.top/contribution-knowledge-mining/

Please cite:

Guangchuang Yu. Gene Ontology Semantic Similarity Analysis Using
GOSemSim. In: Kidder B. (eds) Stem Cell Transcriptional Networks.
Methods in Molecular Biology. 2020, 2117:207-215. Humana, New York, NY.
Code
#BiocManager::install('EnhancedVolcano')
library(EnhancedVolcano)
Loading required package: ggrepel
Code
library(AnnotationDbi)

Attaching package: 'AnnotationDbi'

The following object is masked from 'package:clusterProfiler':

    select

The following object is masked from 'package:dplyr':

    select
Code
library(org.Hs.eg.db) # This is the "dictionary" for human genes

1.2 : Defining the Analysis Parameters

Code
# 1.2 : Defining the Analysis Parameters



fdr_cutoff <- 0.05
fc_cutoff <- 2 # Fold Change (not log2)
log2fc_cutoff <- log2(fc_cutoff)
qval_cutoff <- 0.1

1.3 : Loading the count data

Code
# 1.3 : Loading the count data

# Define the path to your counts file
counts_file_path <- "GSE208353_Raw_gene_counts_matrix.txt" 

# Load the count data
# header = TRUE tells R the first row contains sample names (GSMs)
# row.names = 1 tells R the first column contains the gene names
counts_data <- read.delim(
  file = counts_file_path, 
  header = TRUE, 
  row.names = 1
)

# Always check the first few lines to make sure it loaded correctly
# You should see gene names as row names and GSM IDs as column names
head(counts_data)
                Control_1 Control_2 Control_3 CRISPRpatient1_1 CRISPRpatient1_2
ENSG00000223972         0         0         0                0                0
ENSG00000227232         0         1         3                5               11
ENSG00000278267         5         8         4                1                2
ENSG00000243485         0         0         0                0                0
ENSG00000284332         0         0         0                0                0
ENSG00000237613         0         0         0                0                0
                CRISPRpatient1_3 Patient1_1 Patient1_2 Patient1_3 Patient2_1
ENSG00000223972                0          0          0          0          0
ENSG00000227232                2          3         51          8          2
ENSG00000278267                3          1          2          3          4
ENSG00000243485                0          0          0          0          0
ENSG00000284332                0          0          0          0          0
ENSG00000237613                0          0          0          0          0
                Patient2_2 Patient2_3 Patient3_1 Patient3_2 Patient3_3
ENSG00000223972          0          0          0          0          0
ENSG00000227232         23          0          2         14          0
ENSG00000278267          2          5          2          2          2
ENSG00000243485          0          0          0          0          0
ENSG00000284332          0          0          0          0          0
ENSG00000237613          0          0          0          0          0
Code
# view(counts_data)  #(optional)

1.4 : Loading the series matrix data(or metadata)

Code
# 1.4 : Loading the series matrix data(or metadata)

# Define the path to your metadata file
metadata_file <- "metadata.csv"

# Load the metadata (comma-separated .csv file)
colData <- read.csv(
  file = metadata_file,
  row.names = 1 # Set the first column (sample_id) as row names
)

# Prepare Metadata for Analysis
# We MUST convert our experimental columns from text to "factors".
# This is how DESeq2 understands our experimental design.
colData$group <- factor(colData$group)
colData$genotype <- factor(colData$genotype)


# Final Check (The "Sanity Check")

# A. Visual Check: Let's look at our objects
print(colData)
                     group          genotype
Control_1          Control         Wild_type
Control_2          Control         Wild_type
Control_3          Control         Wild_type
CRISPRpatient1_1 CRISPR_P1 CRISPR_correction
CRISPRpatient1_2 CRISPR_P1 CRISPR_correction
CRISPRpatient1_3 CRISPR_P1 CRISPR_correction
Patient1_1        Patient1          c.766C>T
Patient1_2        Patient1          c.766C>T
Patient1_3        Patient1          c.766C>T
Patient2_1        Patient2         c.2416C>T
Patient2_2        Patient2         c.2416C>T
Patient2_3        Patient2         c.2416C>T
Patient3_1        Patient3        c.801-2A>G
Patient3_2        Patient3        c.801-2A>G
Patient3_3        Patient3        c.801-2A>G
Code
str(colData) # You should see 'Factor' for group and genotype
'data.frame':   15 obs. of  2 variables:
 $ group   : Factor w/ 5 levels "Control","CRISPR_P1",..: 1 1 1 2 2 2 3 3 3 4 ...
 $ genotype: Factor w/ 5 levels "c.2416C>T","c.766C>T",..: 5 5 5 4 4 4 2 2 2 1 ...
Code
# B. The Critical Match Check:
# This checks if the row names in 'colData' are in the
# EXACT same order as the column names in 'counts_data'.
check_result <- all(rownames(colData) == colnames(counts_data))

# should return true, to verify that the metadata rows matches the counts data columns 

print(check_result)
[1] TRUE

1.5 : Validation 1: The Case

Code
# 1.5 : Validation 1: The Case

# CHECKPOINT: Spot-Check DNAJC6 (ENSG00000116675) Counts 

# Define the Ensembl ID for DNAJC6 viz ENSG00000116675
target_gene_id <- "ENSG00000116675"

# Check if the gene exists in our data
target_gene_id %in% rownames(counts_data)
[1] TRUE
Code
# returns true when the ensembl id you designated is actually there in the rownames column of the counnts_data
  
# Create a data frame for our plot
  dnajc6_plot_df <- data.frame(
    counts = as.numeric(counts_data[target_gene_id, ]),
    group = colData$group
  )
  
# Create the plot using ggplot2
  # We use geom_jitter() to see all the individual dots
  sanity_check_plot <- ggplot(dnajc6_plot_df, aes(x = group, y = counts, color = group)) +
  geom_jitter(size = 3, width = 0.1, show.legend = FALSE) +
  theme_bw(base_size = 14) +
  labs(
    title = "Sanity Check: DNAJC6 (ENSG00000116675)",
    x = "Experimental Group",
    y = "Raw Counts"
  )

# Display the plot
sanity_check_plot

Code
# Check for Nonsense-Mediated Decay (NMD)

# We'll use the dnajc6_plot_df data frame we already made for the sanity check.
# This time, we'll use a boxplot to see the distributions.
dnajc6_counts_boxplot <- ggplot(dnajc6_plot_df, aes(x = group, y = counts, fill = group)) +
  geom_boxplot() +
  geom_jitter(width = 0.1, size = 2) +
  theme_bw(base_size = 14) +
  scale_y_log10() + # Use a log scale to see differences more clearly
  labs(
    title = "DNAJC6 (ENSG00000116675) Expression",
    subtitle = "Comparing Patient 1 (p.R256*), Patient 2 (p.R806*), and Patient 3 (splice-site)",
    x = "Experimental Group",
    y = "Raw Counts (log10 scale)"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(dnajc6_counts_boxplot)

1.6 : Validation 2 : Global Quality Control using PCA

Code
# 1.6 : Validation 2 : Global Quality Control using PCA

# Create the DESeqDataSet object
# This object bundles our counts, metadata, and experimental design.
# We'll use a simple design formula for this QC step.
dds <- DESeqDataSetFromMatrix(
  countData = counts_data,
  colData = colData,
  design = ~ group 
)

dds <- DESeq(dds)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
Code
# Apply variance-stabilizing transformation (vst)
# We MUST do this before making a PCA plot.
# It transforms the raw, noisy count data onto a scale 
# where samples can be fairly compared.
# 'blind = TRUE' is important: it ensures the transformation
# is unbiased by our experimental design.
vsd <- vst(dds, blind = TRUE)

# Generate and print the PCA plot
# We'll use the built-in 'plotPCA' function.
# 'intgroup' tells the plot which metadata columns to use for 
# coloring and shaping the dots. Using both is a great idea.
PCAplot <- plotPCA(vsd, intgroup = c("group", "genotype")) +
    labs(
      title = "PCA of Global Gene Expression",
      subtitle = "Samples clustered by experimental group"
    ) +
    theme_bw(base_size = 14) +
    # This makes the plot axes proportional (looks cleaner)
    coord_fixed() 
using ntop=500 top features by variance
Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
ℹ Please use tidy evaluation idioms with `aes()`.
ℹ See also `vignette("ggplot2-in-packages")` for more information.
ℹ The deprecated feature was likely used in the DESeq2 package.
  Please report the issue to the authors.
Coordinate system already present.
ℹ Adding new coordinate system, which will replace the existing one.
Code
PCAplot

1.7 : Validation 3 : Global Quality Control using Dispersion Plot

Code
# 1.7 : Validation 3 : Global Quality Control using Dispersion Plot

# This checks DESeq2's model fitting. 
# We want to see the black dots (gene estimates) shrinking 
# towards the red line (fitted estimate) with blue outliers flagged.
plotDispEsts(dds) # Use the full, unfiltered dds object before minimal filtering

1.8 : Validation 4 : Global Quality Control using sample to sample heatmap

Code
# 1.8 : Validation 4 : Global Quality Control using sample to sample heatmap

# This complements the PCA plot. It shows the correlation between samples.
# We use the variance-stabilized data (vsd) we made for the PCA.
library(RColorBrewer) # For heatmap colors

# Calculate distances
sampleDists <- dist(t(assay(vsd))) 
sampleDistMatrix <- as.matrix(sampleDists)

# Define row names based on group
rownames(sampleDistMatrix) <- paste(vsd$group, vsd$genotype, sep="-") 
colnames(sampleDistMatrix) <- NULL # Hide column names for clarity

# Choose colors
colorsQC <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)

# Plot the heatmap
pheatmap(sampleDistMatrix,
         clustering_distance_rows = sampleDists,
         clustering_distance_cols = sampleDists,
         col = colorsQC,
         main = "Sample-to-Sample Distance Heatmap")

1.9 : Pre-filtering genes for further analysis

2 methods

Code
# 1.9 : Pre-filtering genes for further analysis

# (A) : Pre-filtering Low-Count Genes


# (Trying to replicate what the authors did in their original work and kinda comparing that with the results of a light pre-filter before Deseq-in the next step)

# Our 'dds' object still holds the raw, unfiltered da-ta.
# Now the cpm() function will work.
cpm_mat <- cpm(counts(dds))

# Set the filter criteria
min_cpm <- 1
min_samples <- 3

# keep_genes is a TRUE/FALSE list (op = original paper)
keep_genes_op <- rowSums(cpm_mat >= min_cpm) >= min_samples

# Now, filter the original 'dds' object
dds_filtered_op <- dds[keep_genes_op, ]


# Let's see how many genes we kept
print(paste("Original gene count:", nrow(dds)))
[1] "Original gene count: 60676"
Code
print(paste("Filtered gene count (original work):", nrow(dds_filtered_op)))
[1] "Filtered gene count (original work): 17709"
Code
# (B) : Minimal Pre-filtering (DESeq2 Best Practice) 


# We will start from our original, full 'dds' object (60,676 genes)
# which we created for the PCA plot.

# Set the minimal filter
# We'll keep genes that have a total of at least 10 reads
# across all 15 samples. This is a very light "junk" filter.
keep <- rowSums(counts(dds)) >= 10

# Filter the dds object
dds_minimal_filtered <- dds[keep, ]


# Let's see how many genes we kept
# This will be *more* genes than our 17,709, and that's good!
print(paste("Original gene count:", nrow(dds)))
[1] "Original gene count: 60676"
Code
print(paste("Filtered gene count (minimal):", nrow(dds_minimal_filtered)))
[1] "Filtered gene count (minimal): 29810"

PHASE 2 : Isogenic Analysis

2.1 : Isogenic Comparision

Code
# 2.1 : Isogenic Comparison


# Subset our minimally-filtered dataset
# We take 'dds_minimal_filtered' and keep only the columns
# where the group is "Patient1" or "CRISPR_P1".
dds_isogenic <- dds_minimal_filtered[ , dds_minimal_filtered$group %in% c("Patient1", "CRISPR_P1") ]

# IMPORTANT: Remove unused factor levels
# After subsetting, R still remembers "Patient2", "Patient3", etc.
# This command drops those unused levels, which is crucial for DESeq.
dds_isogenic$group <- droplevels(dds_isogenic$group)

# Set the "Control" Level (Reference)
# We MUST tell DESeq2 our baseline.
# We want to see Patient1 *relative to* CRISPR_P1.
dds_isogenic$group <- relevel(dds_isogenic$group, ref = "CRISPR_P1")

# Run the Analysis
# This single command runs the full DESeq pipeline
# on our 6 selected samples.
dds_isogenic <- DESeq(dds_isogenic)
using pre-existing size factors
estimating dispersions
found already estimated dispersions, replacing these
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
Code
# Get the Results
# This extracts the results table. DESeq2's automatic
# independent filtering (your friend's point) happens here.
res_isogenic <- results(dds_isogenic)

# CHECK THE RESULTS
# Let's see a summary
# It will tell us how many genes are "up" or "down"
# using the default 10% FDR (p-adj < 0.1).
summary(res_isogenic)

out of 29683 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up)       : 3733, 13%
LFC < 0 (down)     : 3892, 13%
outliers [1]       : 5, 0.017%
low counts [2]     : 8055, 27%
(mean count < 4)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results

2.2 : Visualizations for significant genes : Volcano Plot

Code
# 2.2(A) : Volcano Plot 

# Convert our results object to a data frame
# ggplot2 works best with data frames.
# Create Data Frame and Clean IDs
res_isogenic_df <- as.data.frame(res_isogenic) %>%
  rownames_to_column("ensembl_id_version") # Keep original IDs with version

# Clean IDs for mapping (remove version)
res_isogenic_df$ensembl_id_clean <- gsub("\\..*$", "", res_isogenic_df$ensembl_id_version)

# Add a column to identify significant genes
# We'll use the paper's cutoffs: padj < 0.05 and log2FC > 1 as specified before in 1.2
res_isogenic_df <- res_isogenic_df %>%
  mutate(
    significant = case_when(
      padj < fdr_cutoff & abs(log2FoldChange) > log2fc_cutoff ~ "Significant",
      TRUE ~ "Not Significant" # 'TRUE' means "everything else"
    )
  )

# How many genes met the paper's exact cutoff?
sig_genes_cutoff_isogenic <- res_isogenic_df %>%
  filter(significant == "Significant")

print(paste(
  "Total genes meeting paper's cutoff (FDR < 0.05 & FC > 2):", 
  nrow(sig_genes_cutoff_isogenic)
))
[1] "Total genes meeting paper's cutoff (FDR < 0.05 & FC > 2): 1892"
Code
# Create the Volcano Plot

isogenic_res_vol <-  ggplot(res_isogenic_df,
    aes(x = log2FoldChange, y = -log10(padj))) +
    geom_point(aes(color = significant), size = 1, alpha = 0.5) +
    scale_color_manual(
      values = c("Not Significant" = "grey", "Significant" = "dodgerblue2" ),
      name = "Significance"
    ) +
    
    # Add vertical lines for our fold-change cutoff
    geom_vline(xintercept = c(-log2fc_cutoff, log2fc_cutoff), linetype = "dashed", color = "black") +
    
    # Add a horizontal line for our p-value cutoff
    geom_hline(yintercept = -log10(fdr_cutoff), linetype = "dashed", color = "black") +
    
    labs(
      title = "Volcano Plot: Patient1 vs. CRISPR_P1",
      x = "log2(Fold Change)",
      y = "-log10(Adjusted P-value)"
    ) +
    theme_minimal(base_size = 14)

isogenic_res_vol
Warning: Removed 8187 rows containing missing values or values outside the scale range
(`geom_point()`).

Code
# (B) : Enhanced Volcano Plot 

library(EnhancedVolcano)

# Map Cleaned IDs to Gene Symbols
gene_map_symbols <- mapIds(
    org.Hs.eg.db, 
    keys = res_isogenic_df$ensembl_id_clean, 
    keytype = "ENSEMBL", 
    column = "SYMBOL", 
    multiVals = "first"
    )
'select()' returned 1:many mapping between keys and columns
Code
# Add Symbol Column to Data Frame 
# Use the cleaned Ensembl ID as the key to look up the symbol
res_isogenic_df$symbol <- gene_map_symbols[res_isogenic_df$ensembl_id_clean]

# Handle NAs: If symbol is NA, use the original Ensembl ID (with version)
res_isogenic_df$symbol <- ifelse(
    is.na(res_isogenic_df$symbol), 
    res_isogenic_df$ensembl_id_version, # Fallback to original ID
    res_isogenic_df$symbol
    )

# Identify Top Genes and DNAJC6 Symbol 

# Find the symbol corresponding to DNAJC6's cleaned ID
dnajc6_symbol <- res_isogenic_df$symbol[res_isogenic_df$ensembl_id_clean == "ENSG00000116675"][1] 

# Get the symbols of the top 10 most significant genes
iso_top_10_symbols <- res_isogenic_df %>%
  filter(!is.na(padj)) %>%
  arrange(padj) %>%
  head(10) %>%
  pull(symbol)

# Combine DNAJC6 and top 10 for labeling
iso_labels_to_show <- unique(c(dnajc6_symbol, iso_top_10_symbols))

# Generate the Plot
isogenic_enh_vol <- EnhancedVolcano(res_isogenic_df,
    lab = res_isogenic_df$symbol, # Use the new symbol column for labels
    x = 'log2FoldChange',
    y = 'padj',
    title = 'Volcano Plot: Patient1 vs. CRISPR_P1',
    subtitle = 'Isogenic Comparison',
    pCutoff = fdr_cutoff,         # Use parameterized cutoff
    FCcutoff = log2fc_cutoff,     # Use parameterized cutoff
    pointSize = 2.0,
    labSize = 4.0,
    selectLab = iso_labels_to_show,   # Label DNAJC6 and top 10 using symbols
    drawConnectors = TRUE,
    widthConnectors = 0.75,
    max.overlaps = Inf,# Ensure all selected labels are shown
    legendPosition = 'right'

    )
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
ℹ The deprecated feature was likely used in the EnhancedVolcano package.
  Please report the issue to the authors.
Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
ℹ Please use the `linewidth` argument instead.
ℹ The deprecated feature was likely used in the EnhancedVolcano package.
  Please report the issue to the authors.
Code
isogenic_enh_vol

2.3 : Visualizations for top 50 deferentially expressed genes

Code
# 2.3 : Heatmap 


# Get Top 50 Genes
iso_top_50_genes <- res_isogenic %>%
  as.data.frame() %>%
  rownames_to_column("ensembl_id") %>%
  filter(!is.na(padj)) %>%
  arrange(padj) %>%
  head(50)

# Get Normalized Counts (for ALL 15 samples)
vsd_counts_all_samples <- assay(vsd)
iso_top_50_counts_all_samples <- vsd_counts_all_samples[iso_top_50_genes$ensembl_id, ]

# Get the 6 sample names we *actually* want to plot
isogenic_sample_names <- colnames(dds_isogenic)

# Filter the counts matrix to only our 6 samples
iso_top_50_counts <- iso_top_50_counts_all_samples[ , isogenic_sample_names]

# Robust Gene Symbol Conversion
gene_map <- mapIds(
  org.Hs.eg.db,
  keys = rownames(iso_top_50_counts),
  keytype = "ENSEMBL",
  column = "SYMBOL"
)
'select()' returned 1:many mapping between keys and columns
Code
gene_symbols <- ifelse(is.na(gene_map), names(gene_map), gene_map)
rownames(iso_top_50_counts) <- make.names(gene_symbols, unique = TRUE)

# Create Column Annotations
# 'annotation_col' has 6 rows
# 'top_50_counts' now has 6 columns
annotation_col <- data.frame(
  Group = dds_isogenic$group
)
rownames(annotation_col) <- colnames(iso_top_50_counts) # <-- Lengths now match!

# Generate the Improved Heatmap
pheatmap(
  iso_top_50_counts,
  scale = "row",
  cluster_rows = TRUE,
  cluster_cols = TRUE,
  show_rownames = TRUE,
  show_colnames = FALSE,
  main = "Top 50 Genes (Patient1 vs. CRISPR_P1)",
  annotation_col = annotation_col,
  border_color = NA,
  fontsize_row = 8,
)

Code
# The gene symbols are already stored as the row names 
# of the 'top_50_counts' data frame we just plotted.

iso_gene_symbol_list <- rownames(iso_top_50_counts)

# Print the list to the console
print(iso_gene_symbol_list)
 [1] "ZNF248"          "DNAJC6"          "NNAT"            "INPP5F"         
 [5] "FAM50A"          "NAP1L5"          "LOC339260"       "NR2F1"          
 [9] "CHL1"            "CTSF"            "PAX6"            "PRDM12"         
[13] "TENM1"           "FEZF1"           "BRINP2"          "MAB21L1"        
[17] "LMX1A"           "POMC"            "GABRA4"          "RGS8"           
[21] "DLX1"            "BARHL2"          "ZNF736"          "ENSG00000231764"
[25] "UNCX"            "TPH1"            "MYH14"           "SIX3"           
[29] "H19"             "NEUROD1"         "KCNJ3"           "SLCO1C1"        
[33] "DLX2"            "LRRC61"          "CRYAB"           "TBR1"           
[37] "MAGEL2"          "CNTN4"           "PTPRT"           "STUM"           
[41] "ABCA4"           "PTCHD1"          "ZFHX4.AS1"       "HMGB3"          
[45] "MEIS1"           "SLITRK2"         "ENSG00000276805" "CNTN6"          
[49] "ENSG00000267640" "ERBB4"          

2.4 : GO Enrichment (Biological Process, Molecular Function and Cellular Compartment)

Code
# 2.4 Gene Ontology Enrichment (ORA)

# Get our gene lists
res_isogenic_df <- as.data.frame(res_isogenic) %>%
  rownames_to_column("ensembl_id") # Get Ensembl IDs

iso_all_tested_gene_ids <- rownames(res_isogenic)

iso_sig_gene_ids <- res_isogenic_df %>%
  filter(padj < fdr_cutoff & abs(log2FoldChange) > log2fc_cutoff & !is.na(padj)) %>%
  pull(ensembl_id) # pull() gets the column as a vector


# We'll use gsub() to find a dot (".") and anything after it ("\\..*$")
# and replace it with nothing ("").
iso_all_gene_ids_cleaned <- gsub("\\..*$", "", iso_all_tested_gene_ids)

iso_sig_gene_ids_cleaned <- gsub("\\..*$", "", iso_sig_gene_ids)

# Convert Ensembl IDs to Entrez IDs (using cleaned IDs)
iso_all_entrez_ids <- mapIds(
  org.Hs.eg.db,
  keys = iso_all_gene_ids_cleaned, # Use cleaned IDs
  keytype = "ENSEMBL",
  column = "ENTREZID",
  multiVals = "first"
)
'select()' returned 1:many mapping between keys and columns
Code
iso_sig_entrez_ids <- mapIds(
  org.Hs.eg.db,
  keys = iso_sig_gene_ids_cleaned, # Use cleaned IDs
  keytype = "ENSEMBL",
  column = "ENTREZID",
  multiVals = "first"
)
'select()' returned 1:many mapping between keys and columns
Code
# Run the GO Enrichment (for Biological Process)
# to know more use ?enrichGO

iso_go_res_bp <- enrichGO(
  gene = iso_sig_entrez_ids,
  universe = iso_all_entrez_ids,
  OrgDb = org.Hs.eg.db,
  ont = "BP",                 # "BP" = Biological Process
  pAdjustMethod = "BH",
  pvalueCutoff = fdr_cutoff,
  qvalueCutoff = qval_cutoff,
  readable = TRUE             # This converts Entrez IDs back to Symbols
)

# CHECK THE RESULTS
# Let's look at the top 5 hits

#print(head(as.data.frame(go_results_ora), n = 5))

# Plot the Results
# A dot plot is the best way to see this
plot_isogenic_go_res_bp <- print(
  dotplot(iso_go_res_bp, showCategory = 15) + 
    labs(title = "GO Enrichment (ORA): Top 15 Biological Processes") +
    theme_minimal()
)

Code
#print(go_results_ora$Description)

# Run the GO Enrichment (for Molecular funtion)
iso_go_res_mf <- enrichGO(
  gene = iso_sig_entrez_ids,
  universe = iso_all_entrez_ids,
  OrgDb = org.Hs.eg.db,
  ont = "MF",                 # "BP" = Biological Process
  pAdjustMethod = "BH",
  pvalueCutoff = fdr_cutoff,
  qvalueCutoff = qval_cutoff,
  readable = TRUE             # This converts Entrez IDs back to Symbols
)

plot_isogenic_go_res_mf <- print(
  dotplot(iso_go_res_mf, showCategory = 15) + 
    labs(title = "GO Enrichment (ORA): Top 15 Molecular functions") +
    theme_minimal()
)

Code
# Run the GO Enrichment (for Cellular Compartment)
iso_go_res_cc <- enrichGO(
  gene = iso_sig_entrez_ids,
  universe = iso_all_entrez_ids,
  OrgDb = org.Hs.eg.db,
  ont = "CC",                 # "BP" = Biological Process
  pAdjustMethod = "BH",
  pvalueCutoff = fdr_cutoff,
  qvalueCutoff = qval_cutoff,
  readable = TRUE             # This converts Entrez IDs back to Symbols
)

plot_isogenic_go_res_cc <- print(
  dotplot(iso_go_res_cc, showCategory = 15) + 
    labs(title = "GO Enrichment (ORA): Top 15 Cellular Compartments") +
    theme_minimal()
)

2.5 : Gene Set Enrichment Analysis

Code
# 2.5 : Gene Set Enrichment Analysis (GSEA)


# Use 'select' to get a mapping data frame.
# This is a bit more robust than mapIds for this purpose.
iso_gene_map_df <- AnnotationDbi::select(
  org.Hs.eg.db,
  keys = iso_all_gene_ids_cleaned,
  columns = c("ENTREZID"),
  keytype = "ENSEMBL"
) %>%
  # remove duplicates, just keep first mapping
  distinct(ENSEMBL, .keep_all = TRUE) 
'select()' returned 1:many mapping between keys and columns
Code
# Join map with our results
# First, add the cleaned ENSEMBL ID to our results
res_isogenic_df$ENSEMBL <- iso_all_gene_ids_cleaned

# Now, join them
iso_res_mapped <- inner_join(res_isogenic_df, iso_gene_map_df, by = "ENSEMBL")


# Handle Duplicate Entrez IDs
# We'll pick the gene with the "strongest" signal (highest absolute stat)
iso_res_ranked <- iso_res_mapped %>%
  filter(!is.na(stat) & !is.na(ENTREZID)) %>%
  # Group by Entrez ID
  group_by(ENTREZID) %>%
  # Find the row with the max absolute 'stat' value
  slice_max(order_by = abs(stat), n = 1) %>%
  ungroup()

# Create the Final Ranked List
# This is the special vector GSEA needs:
# values = the 'stat' column
# names = the 'ENTREZID' column


iso_gene_list_gsea <- iso_res_ranked$stat
names(iso_gene_list_gsea) <- iso_res_ranked$ENTREZID

# CRITICAL: Sort the list in decreasing order
iso_gene_list_gsea_sorted <- sort(iso_gene_list_gsea, decreasing = TRUE)

# Run GSEA for Biological Process
# to know more use ?gseGO
iso_gsea_bp_res <- gseGO(
  geneList = iso_gene_list_gsea_sorted,
  OrgDb = org.Hs.eg.db,
  ont = "BP", # Biological Process
  pvalueCutoff = fdr_cutoff,
  pAdjustMethod = "BH",
  verbose = TRUE # Shows progress
)
using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
preparing geneSet collections...
GSEA analysis...
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (3.7% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
minSize, : For some of the pathways the P-values were likely overestimated. For
such pathways log2err is set to NA.
Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
minSize, : For some pathways, in reality P-values are less than 1e-10. You can
set the `eps` argument to zero for better estimation.
leading edge analysis...
done...
Code
# Plot the GSEA Results
# Dot plot of top 20 pathways
iso_gsea_bp_plot <- dotplot(iso_gsea_bp_res, showCategory = 15) +
    labs(title = "GSEA Results: Top 15 Biological Processes") +
    theme_minimal()

iso_gsea_bp_plot

Code
# Run GSEA for Molecular function
iso_gsea_mf_res <- gseGO(
  geneList = iso_gene_list_gsea_sorted,
  OrgDb = org.Hs.eg.db,
  ont = "MF", # Molecular function
  pvalueCutoff = fdr_cutoff,
  pAdjustMethod = "BH",
  verbose = TRUE # Shows progress
)
using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
preparing geneSet collections...
GSEA analysis...
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (3.7% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
minSize, : For some of the pathways the P-values were likely overestimated. For
such pathways log2err is set to NA.
Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
minSize, : For some pathways, in reality P-values are less than 1e-10. You can
set the `eps` argument to zero for better estimation.
leading edge analysis...
done...
Code
# Plot the GSEA Results
# Dot plot of top 20 funtions affected
iso_gsea_mf_plot <- dotplot(iso_gsea_mf_res, showCategory = 15) +
    labs(title = "GSEA Results: Top 15 Molecular Functions") +
    theme_minimal()

iso_gsea_mf_plot

Code
# Run GSEA for Cellular Compartment
iso_gsea_cc_res <- gseGO(
  geneList = iso_gene_list_gsea_sorted,
  OrgDb = org.Hs.eg.db,
  ont = "CC", # Molecular function
  pvalueCutoff = fdr_cutoff,
  pAdjustMethod = "BH",
  verbose = TRUE # Shows progress
)
using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
preparing geneSet collections...
GSEA analysis...
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (3.7% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
minSize, : For some of the pathways the P-values were likely overestimated. For
such pathways log2err is set to NA.
Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
minSize, : For some pathways, in reality P-values are less than 1e-10. You can
set the `eps` argument to zero for better estimation.
leading edge analysis...
done...
Code
# Plot the GSEA Results
# Dot plot of top 20 compartments
iso_gsea_cc_plot <- dotplot(iso_gsea_cc_res, showCategory = 15) +
    labs(title = "GSEA Results: Top 15 Cellular Compartments") +
    theme_minimal()

iso_gsea_cc_plot

2.6 : Counting Keyword Frequencies in GSEA Results

Code
# 2.6 : Count Keyword Frequencies in GSEA Results


# Get the list of all descriptions
iso_gsea_descriptions <- as.data.frame(iso_gsea_bp_res)$Description %>%
  c(as.data.frame(iso_gsea_mf_res)$Description) %>%
  c(as.data.frame(iso_gsea_cc_res)$Description)

# Define your keywords of interest (customize this list!)
# We'll use some key themes we've seen.
keywords_to_count <- c(
  # Synaptic Function & Vesicle Recycling
  "synap", "synapse", "synaptic", "synaptogenesis", "vesicle", "clathrin", 
  "endocytosis", "neurotransmitter", "transmission", "recycle", "recycling",

  # Neurodevelopment & Cell Fate
  "develop", "development", "developmental", "neurog", "neurogenesis", "neuron", 
  "neuronal", "differentiation", "fate", "specification", "progenitor", 
  "morphogenesis", "Wnt", "signal", "signaling",

  # Neuronal Structure
  "axon", "axonogenesis", "dendrite", "dendritic", "terminus", "terminal", 
  "cleft", "junction", "growth cone", "projection",

  # Transcription & Regulation
  "transcription", "regulation", "binding", "DNA binding", "protein binding", 
  "factor", "repressor", "activator",

  # Cell-Specific Keywords
  "dopamine", "dopaminergic", "midbrain", "striat", "striatum", "corpus striatum"
)

# Count occurrences for each keyword (case-insensitive)
iso_keyword_counts <- map_int(keywords_to_count, ~sum(str_detect(iso_gsea_descriptions, regex(.x, ignore_case = TRUE))))

# Create a nice summary table
iso_keyword_summary <- tibble(
  Keyword = keywords_to_count,
  Count = iso_keyword_counts
) %>%
  arrange(desc(Count)) # Sort by most frequent

# Print the summary
print(iso_keyword_summary, n = 50) # Print more rows if needed
# A tibble: 50 × 2
   Keyword          Count
   <chr>            <int>
 1 regulation         222
 2 develop             83
 3 development         83
 4 synap               66
 5 binding             59
 6 differentiation     45
 7 synaptic            43
 8 vesicle             42
 9 signal              41
10 signaling           35
11 neuron              28
12 synapse             23
13 transcription       23
14 axon                22
15 factor              20
16 morphogenesis       18
17 junction            12
18 projection          10
19 striat               9
20 Wnt                  8
21 DNA binding          8
22 transmission         7
23 fate                 6
24 neurotransmitter     5
25 developmental        5
26 specification        5
27 endocytosis          4
28 dendrite             4
29 dopamine             4
30 neurog               3
31 neurogenesis         3
32 neuronal             3
33 axonogenesis         3
34 dendritic            3
35 activator            3
36 terminus             2
37 repressor            2
38 dopaminergic         2
39 midbrain             2
40 recycling            1
41 cleft                1
42 growth cone          1
43 protein binding      1
44 synaptogenesis       0
45 clathrin             0
46 recycle              0
47 progenitor           0
48 terminal             0
49 striatum             0
50 corpus striatum      0

PHASE 3 : All Patients vs Control

3.1 : Deferential Gene Expression Analysis for all patients vs controls

Code
# 3.1 : Analysis 2 - All Patients vs. Controls

# Prepare the Metadata
# We need a new column defining our broad groups: "Patient" vs "Control".
# We'll treat the CRISPR-corrected line as a 'Control' for this comparison,
# as it represents the non-diseased state.
colData <- colData %>%
  mutate(status = case_when(
    group %in% c("Control", "CRISPR_P1") ~ "Control_Group",
    group %in% c("Patient1", "Patient2", "Patient3") ~ "Patient_Group"
  ))

# Convert the new column to a factor
colData$status <- factor(colData$status)

# Create the DESeqDataSet
# We use our 'dds_minimal_filtered' object, which has all 15 samples.
# The design formula is key: ~ genotype + status
# This tells DESeq2: "First, account for differences due to the specific
# 'genotype', and THEN find the differences due to 'status'."
dds_all_patients <- DESeqDataSetFromMatrix(
  countData = counts(dds_minimal_filtered), # Use counts from the filtered object
  colData = colData,                         # Use our updated colData
  design = ~ status
)

# Set the "Control" Level
# We must tell DESeq2 which 'status' is the reference.
dds_all_patients$status <- relevel(dds_all_patients$status, ref = "Control_Group")

# Run the Analysis
# This might take slightly longer as it's using all 15 samples.
dds_all_patients <- DESeq(dds_all_patients)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
-- replacing outliers and refitting for 60 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing
Code
# Get the Results
# We explicitly ask for the comparison "Patient_Group vs Control_Group".
res_all_patients <- results(dds_all_patients, name = "status_Patient_Group_vs_Control_Group")

# CHECK THE RESULTS
# Let's see the summary (default 10% FDR).
summary(res_all_patients)

out of 29809 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up)       : 4082, 14%
LFC < 0 (down)     : 4510, 15%
outliers [1]       : 47, 0.16%
low counts [2]     : 2313, 7.8%
(mean count < 1)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results

3.2 : Visualizations for significant genes : Volcano Plot

Code
# 3.2 : Volcano Plots

# Setup: Ensure results are in a data frame
# (Assuming res_all_patients is already created)
res_all_patients_df <- as.data.frame(res_all_patients) %>%
  rownames_to_column("ensembl_id_version") # Keep original IDs

res_all_patients_df$ensembl_id_clean <- gsub("\\..*$", "", res_all_patients_df$ensembl_id_version)

# 3.2(A) : Standard ggplot2 Volcano

# Add significance column 
res_all_patients_df <- res_all_patients_df %>%
  mutate(
    significant = case_when(
      padj < fdr_cutoff & abs(log2FoldChange) > log2fc_cutoff ~ "Significant",
      TRUE ~ "Not Significant"
    )
  )

# Count significant genes 
sig_genes_cutoff_all <- res_all_patients_df %>%
  filter(significant == "Significant")

print(paste(
  "Total genes (Analysis 2) meeting cutoff:",
  nrow(sig_genes_cutoff_all)
))
[1] "Total genes (Analysis 2) meeting cutoff: 3496"
Code
# Create the ggplot Volcano Plot
# (Variables renamed for consistency with Analysis 1)
all_patients_res_vol <- ggplot(res_all_patients_df, aes(x = log2FoldChange, y = -log10(padj))) +
    geom_point(aes(color = significant), size = 1, alpha = 0.5) + 
    scale_color_manual(
      values = c("Not Significant" = "grey", "Significant" = "dodgerblue2"),
      name = "Significance" # Set to match Analysis 1
    ) +
    geom_vline(xintercept = c(-log2fc_cutoff, log2fc_cutoff), linetype = "dashed", color = "black") +
    geom_hline(yintercept = -log10(fdr_cutoff), linetype = "dashed", color = "black") +
    labs(
      title = "Volcano Plot: All Patients vs. Controls",
      x = "log2(Fold Change)",
      y = "-log10(Adjusted P-value)"
    ) +
    theme_minimal(base_size = 14)

print(all_patients_res_vol) # Display the ggplot volcano
Warning: Removed 2360 rows containing missing values or values outside the scale range
(`geom_point()`).

Code
# 3.2(B) : EnhancedVolcano Plot with Gene Symbols
library(EnhancedVolcano)

# Map IDs to Symbols for this dataframe
gene_map_symbols_all <- mapIds(
    org.Hs.eg.db, 
    keys = res_all_patients_df$ensembl_id_clean, 
    keytype = "ENSEMBL", 
    column = "SYMBOL", 
    multiVals = "first"
    )
'select()' returned 1:many mapping between keys and columns
Code
res_all_patients_df$symbol <- gene_map_symbols_all[res_all_patients_df$ensembl_id_clean]
res_all_patients_df$symbol <- ifelse(
    is.na(res_all_patients_df$symbol), 
    res_all_patients_df$ensembl_id_version, 
    res_all_patients_df$symbol
    )

# Identify Top Genes and DNAJC6 Symbol
# (Variables renamed for consistency with Analysis 1)
dnajc6_symbol_all <- res_all_patients_df$symbol[res_all_patients_df$ensembl_id_clean == "ENSG00000116675"][1] 

all_top_10_symbols <- res_all_patients_df %>%
  filter(!is.na(padj)) %>%
  arrange(padj) %>%
  head(10) %>%
  pull(symbol)

all_labels_to_show <- unique(c(dnajc6_symbol_all, all_top_10_symbols))

# Generate the EnhancedVolcano Plot
all_patients_enh_vol <- EnhancedVolcano(res_all_patients_df,
    lab = res_all_patients_df$symbol, 
    x = 'log2FoldChange',
    y = 'padj',
    title = 'Volcano Plot: All Patients vs. Controls',
    subtitle = 'Broader Comparison',
    pCutoff = fdr_cutoff,        
    FCcutoff = log2fc_cutoff,      
    pointSize = 2.0,
    labSize = 4.0,
    selectLab = all_labels_to_show, # Use new consistent variable
    drawConnectors = TRUE,
    widthConnectors = 0.75,
    max.overlaps = Inf,
    legendPosition = 'right'
)


all_patients_enh_vol # Display the plot

3.3 : Visualizations for top 50 deferentially expressed genes

Code
# 3.3 : Heatmap

# Get Top 50 Genes
all_top_50_genes <- res_all_patients %>%
  as.data.frame() %>%
  rownames_to_column("ensembl_id") %>% # Renamed column to match theme
  filter(!is.na(padj)) %>%
  arrange(padj) %>%
  head(50)

# Get Normalized Counts (for ALL 15 samples)
# We assume 'vsd_counts_all_samples' still exists from the PCA step
# vsd_counts_all_samples <- assay(vsd) 
all_top_50_counts <- vsd_counts_all_samples[all_top_50_genes$ensembl_id, ]

# Robust Gene Symbol Conversion
gene_map_all <- mapIds(
  org.Hs.eg.db,
  keys = rownames(all_top_50_counts), # Using rownames directly
  keytype = "ENSEMBL",
  column = "SYMBOL"
)
'select()' returned 1:many mapping between keys and columns
Code
gene_symbols_all <- ifelse(is.na(gene_map_all), names(gene_map_all), gene_map_all)
rownames(all_top_50_counts) <- make.names(gene_symbols_all, unique = TRUE)

# Create Column Annotations
# This will have 15 rows for all 15 samples
annotation_col_all <- data.frame(
  Status = colData$status # Using the 'status' column (e.g., "Patient", "Control")
)
rownames(annotation_col_all) <- colnames(all_top_50_counts) 

# Generate the Heatmap
pheatmap(
  all_top_50_counts,
  scale = "row",
  cluster_rows = TRUE,
  cluster_cols = TRUE,
  show_rownames = TRUE,
  show_colnames = FALSE,
  main = "Top 50 Genes (All Patients vs. Controls)",
  annotation_col = annotation_col_all,
  border_color = NA,
  fontsize_row = 8
)

Code
# The gene symbols are stored as the row names 
# of the 'all_top_50_counts' data frame we just plotted.
all_gene_symbol_list <- rownames(all_top_50_counts)

# Print the list to the console
# (Added this section to match your theme)
print(all_gene_symbol_list)
 [1] "INPP5F"          "DNAJC6"          "NNAT"            "EBF1"           
 [5] "STUM"            "OTX1"            "SEC24A"          "COG6"           
 [9] "SEC23A"          "FAM181B"         "PPP1R11"         "NPBWR2"         
[13] "GFPT1"           "ZNF215"          "RGMA"            "ABCA4"          
[17] "SCFD2"           "PAX6"            "ENSG00000286449" "FOXB1"          
[21] "TMEM163"         "POU4F1"          "SV2B"            "MMP10"          
[25] "ADK"             "TOPBP1"          "MRAP2"           "GABPB2"         
[29] "ENSG00000262877" "NPFFR1"          "SLC15A4"         "DDX50"          
[33] "MEIS1"           "CCBE1"           "FOXA2"           "BDNF"           
[37] "NFU1"            "SEPTIN6"         "NEFH"            "MPC1"           
[41] "UBA6.DT"         "FAM177A1"        "GTPBP6"          "ZNF648"         
[45] "ALDH1L2"         "PEG3"            "PCSK1"           "CRIPT"          
[49] "VAV3"            "NEUROD1"        

3.4 : GO Enrichment (Biological Process, Molecular Function and Cellular Compartment)

Code
# 3.4 Gene Ontology Enrichment (ORA)


# Get our gene lists
# We assume 'res_all_patients_df' exists from section 3.2
all_all_tested_gene_ids <- rownames(res_all_patients)

# Get sig genes using the 'significant' column created in 3.2
all_sig_gene_ids <- res_all_patients_df %>%
  filter(significant == "Significant") %>%
  pull(ensembl_id_version) # pull() gets the column as a vector

# Clean the Ensembl IDs (remove version)
all_all_gene_ids_cleaned <- gsub("\\..*$", "", all_all_tested_gene_ids)
all_sig_gene_ids_cleaned <- gsub("\\..*$", "", all_sig_gene_ids)

# Convert Ensembl IDs to Entrez IDs (using cleaned IDs)
all_all_entrez_ids <- mapIds(
  org.Hs.eg.db,
  keys = all_all_gene_ids_cleaned, # Use cleaned IDs
  keytype = "ENSEMBL",
  column = "ENTREZID",
  multiVals = "first"
)
'select()' returned 1:many mapping between keys and columns
Code
all_sig_entrez_ids <- mapIds(
  org.Hs.eg.db,
  keys = all_sig_gene_ids_cleaned, # Use cleaned IDs
  keytype = "ENSEMBL",
  column = "ENTREZID",
  multiVals = "first"
)
'select()' returned 1:many mapping between keys and columns
Code
# --- Run GO Enrichment (BP) ---
all_go_res_bp <- enrichGO(
  gene = all_sig_entrez_ids,
  universe = all_all_entrez_ids,
  OrgDb = org.Hs.eg.db,
  ont = "BP",           
  pAdjustMethod = "BH",
  pvalueCutoff = fdr_cutoff,
  qvalueCutoff = qval_cutoff,
  readable = TRUE       
)

# Plot the Results (BP)
plot_all_patients_go_res_bp <- print(
  dotplot(all_go_res_bp, showCategory = 15) + 
    labs(title = "GO Enrichment (ORA - All Patients): Top 15 BP") +
    theme_minimal()
)

Code
# --- Run GO Enrichment (MF) ---
all_go_res_mf <- enrichGO(
  gene = all_sig_entrez_ids,
  universe = all_all_entrez_ids,
  OrgDb = org.Hs.eg.db,
  ont = "MF",           
  pAdjustMethod = "BH",
  pvalueCutoff = fdr_cutoff,
  qvalueCutoff = qval_cutoff,
  readable = TRUE       
)

# Plot the Results (MF)
plot_all_patients_go_res_mf <- print(
  dotplot(all_go_res_mf, showCategory = 15) + 
    labs(title = "GO Enrichment (ORA - All Patients): Top 15 Molecular Functions") +
    theme_minimal()
)

Code
# --- Run GO Enrichment (CC) ---
all_go_res_cc <- enrichGO(
  gene = all_sig_entrez_ids,
  universe = all_all_entrez_ids,
  OrgDb = org.Hs.eg.db,
  ont = "CC",           
  pAdjustMethod = "BH",
  pvalueCutoff = fdr_cutoff,
  qvalueCutoff = qval_cutoff,
  readable = TRUE       
)

# Plot the Results (CC)
plot_all_patients_go_res_cc <- print(
  dotplot(all_go_res_cc, showCategory = 15) + 
    labs(title = "GO Enrichment (ORA - All Patients): Top 15 Cellular Compartments") +
    theme_minimal()
)

3.5 : Gene Set Enrichment Analysis

Code
# 3.5 : Gene Set Enrichment Analysis (GSEA)

# Use 'select' to get a mapping data frame.
# We assume 'all_all_gene_ids_cleaned' exists from section
all_gene_map_df <- AnnotationDbi::select(
  org.Hs.eg.db,
  keys = all_all_gene_ids_cleaned,
  columns = c("ENTREZID"),
  keytype = "ENSEMBL"
) %>%
  # remove duplicates, just keep first mapping
  distinct(ENSEMBL, .keep_all = TRUE) 
'select()' returned 1:many mapping between keys and columns
Code
# Join map with our results
# We assume 'res_all_patients_df' 
# Add the cleaned ENSEMBL ID to our results df
res_all_patients_df$ENSEMBL <- all_all_gene_ids_cleaned

# Now, join them
all_res_mapped <- inner_join(res_all_patients_df, all_gene_map_df, by = "ENSEMBL")

# Handle Duplicate Entrez IDs
# We'll pick the gene with the "strongest" signal (highest absolute stat)
all_res_ranked <- all_res_mapped %>%
  filter(!is.na(stat) & !is.na(ENTREZID)) %>%
  # Group by Entrez ID
  group_by(ENTREZID) %>%
  # Find the row with the max absolute 'stat' value
  slice_max(order_by = abs(stat), n = 1) %>%
  ungroup()

# Create the Final Ranked List
all_gene_list_gsea <- all_res_ranked$stat
names(all_gene_list_gsea) <- all_res_ranked$ENTREZID

# CRITICAL: Sort the list in decreasing order
all_gene_list_gsea_sorted <- sort(all_gene_list_gsea, decreasing = TRUE)

# --- Run GSEA for Biological Process ---
all_gsea_bp_res <- gseGO(
  geneList = all_gene_list_gsea_sorted,
  OrgDb = org.Hs.eg.db,
  ont = "BP", # Biological Process
  pvalueCutoff = fdr_cutoff,
  pAdjustMethod = "BH",
  verbose = TRUE # Shows progress
)
using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
preparing geneSet collections...
GSEA analysis...
Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
minSize, : For some of the pathways the P-values were likely overestimated. For
such pathways log2err is set to NA.
Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
minSize, : For some pathways, in reality P-values are less than 1e-10. You can
set the `eps` argument to zero for better estimation.
leading edge analysis...
done...
Code
# Plot the GSEA Results
all_gsea_bp_plot <- dotplot(all_gsea_bp_res, showCategory = 15) +
    labs(title = "GSEA Results (All Patients): Top 15 Biological Processes") +
    theme_minimal()

all_gsea_bp_plot

Code
# --- Run GSEA for Molecular function ---
all_gsea_mf_res <- gseGO(
  geneList = all_gene_list_gsea_sorted,
  OrgDb = org.Hs.eg.db,
  ont = "MF", # Molecular function
  pvalueCutoff = fdr_cutoff,
  pAdjustMethod = "BH",
  verbose = TRUE # Shows progress
)
using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
preparing geneSet collections...
GSEA analysis...
Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
minSize, : For some of the pathways the P-values were likely overestimated. For
such pathways log2err is set to NA.
Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
minSize, : For some pathways, in reality P-values are less than 1e-10. You can
set the `eps` argument to zero for better estimation.
leading edge analysis...
done...
Code
# Plot the GSEA Results
all_gsea_mf_plot <- dotplot(all_gsea_mf_res, showCategory = 15) +
    labs(title = "GSEA Results (All Patients): Top 15 Molecular Functions") +
    theme_minimal()

all_gsea_mf_plot

Code
# --- Run GSEA for Cellular Compartment ---
all_gsea_cc_res <- gseGO(
  geneList = all_gene_list_gsea_sorted,
  OrgDb = org.Hs.eg.db,
  ont = "CC", # Cellular Compartment
  pvalueCutoff = fdr_cutoff,
  pAdjustMethod = "BH",
  verbose = TRUE # Shows progress
)
using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
preparing geneSet collections...
GSEA analysis...
Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
minSize, : For some of the pathways the P-values were likely overestimated. For
such pathways log2err is set to NA.
Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
minSize, : For some pathways, in reality P-values are less than 1e-10. You can
set the `eps` argument to zero for better estimation.
leading edge analysis...
done...
Code
# Plot the GSEA Results
all_gsea_cc_plot <- dotplot(all_gsea_cc_res, showCategory = 15) +
    labs(title = "GSEA Results (All Patients): Top 15 Cellular Compartments") +
    theme_minimal()

all_gsea_cc_plot

3.6 : Counting Keyword Frequencies in GSEA Results

Code
# 3.6 : Count Keyword Frequencies in GSEA Results (ANALYSIS 2) ###

# Get the list of all descriptions from all three ontologies
# (This now matches the theme)
all_gsea_descriptions <- as.data.frame(all_gsea_bp_res)$Description %>%
  c(as.data.frame(all_gsea_mf_res)$Description) %>%
  c(as.data.frame(all_gsea_cc_res)$Description)

# We assume 'keywords_to_count' is already defined from section 2.6
# keywords_to_count <- c(...) 

# Count occurrences for each keyword (case-insensitive)
all_keyword_counts <- map_int(keywords_to_count, ~sum(str_detect(all_gsea_descriptions, regex(.x, ignore_case = TRUE))))

# Create a nice summary table
all_keyword_summary <- tibble(
  Keyword = keywords_to_count,
  Count = all_keyword_counts
) %>%
  arrange(desc(Count)) # Sort by most frequent

# Print the summary
print(all_keyword_summary, n = 50) # Print more rows if needed
# A tibble: 50 × 2
   Keyword          Count
   <chr>            <int>
 1 regulation         463
 2 develop            180
 3 development        180
 4 differentiation    100
 5 signal              97
 6 morphogenesis       82
 7 signaling           80
 8 binding             80
 9 factor              72
10 synap               43
11 neuron              30
12 vesicle             27
13 transcription       26
14 synaptic            25
15 junction            22
16 synapse             18
17 fate                13
18 axon                13
19 Wnt                 12
20 striat              12
21 projection          11
22 transmission         9
23 specification        8
24 endocytosis          5
25 developmental        5
26 DNA binding          5
27 protein binding      5
28 activator            5
29 axonogenesis         4
30 neurog               3
31 neurogenesis         3
32 dendritic            3
33 repressor            3
34 dopamine             3
35 dendrite             2
36 dopaminergic         2
37 midbrain             2
38 neuronal             1
39 cleft                1
40 synaptogenesis       0
41 clathrin             0
42 neurotransmitter     0
43 recycle              0
44 recycling            0
45 progenitor           0
46 terminus             0
47 terminal             0
48 growth cone          0
49 striatum             0
50 corpus striatum      0

PHASE 4 : Common Analysis

4.1 : Compare Significant Gene Lists

Code
# 4.1 : Compare Significant Gene Lists (Aesthetic Venn Diagram)

# Load the libraries
library(ggVennDiagram)

Attaching package: 'ggVennDiagram'
The following object is masked from 'package:tidyr':

    unite
Code
library(ggplot2) 

# 1. Get the significant gene lists (Cleaned IDs)
# We'll use the data frames we created in the volcano plot sections
# (sig_genes_cutoff_isogenic from 2.2 and sig_genes_cutoff_all from 4.2)
iso_sig_ids_cleaned <- sig_genes_cutoff_isogenic$ensembl_id_clean
all_sig_ids_cleaned <- sig_genes_cutoff_all$ensembl_id_clean

# Create the list object
gene_lists <- list(
  Isogenic = iso_sig_ids_cleaned,
 `All Patients` = all_sig_ids_cleaned 
)

# Generate the Aesthetic Venn Diagram
overlap_venn_plot <- ggVennDiagram(
    gene_lists,
    category.names = c("Isogenic Analysis", "All Patients Analysis"),
    label = "both",           # Show count and percent (e.g., 957 (22%))
    label_geom = "text",      # Use plain text for labels (no box)
    label_color = "black",    # Color of the count/percent text
    label_size = 5.5,         # Make count/percent text larger
    label_percent_digit = 1   # Show one decimal place (e.g., 21.5%)
  ) +
  # Use a nice blue/cyan color palette
  scale_fill_gradient(low = "#81D4FA", high = "#0277BD") + 
  
  # Add titles and subtitle
  labs(
    title = "Overlap of Significant Genes",
    subtitle = paste0("FDR < ", fdr_cutoff, " & Fold Change > ", fc_cutoff)
  ) +
  
  # Use a clean, blank theme
  theme_void() + 
  
  # Style and center the titles
  theme(
    plot.title = element_text(hjust = 0.5, size = 18, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5, size = 12),
    legend.position = "none" # Hide the gradient legend
  ) +
  
  coord_sf(clip = "off") 
Coordinate system already present.
ℹ Adding new coordinate system, which will replace the existing one.
Code
print(overlap_venn_plot) # Display the plot

Code
# Get the list of overlapping genes (The "Core" Signature) 
core_sig_gene_ids <- intersect(iso_sig_ids_cleaned, all_sig_ids_cleaned)

print(paste("Number of core overlapping genes:", length(core_sig_gene_ids)))
[1] "Number of core overlapping genes: 957"

4.2 : Map Core Signature Genes to Symbols

Code
# 4.2 : Map Core Signature Genes to Symbols

# We will use the 'core_sig_gene_ids' vector 
# (from the previous Venn diagram step, section 5.1)

# Map Ensembl IDs to Gene Symbols
# 'core_sig_gene_ids' already contains cleaned IDs (no version).
core_gene_map <- mapIds(
  org.Hs.eg.db,
  keys = core_sig_gene_ids, # Use the consistent variable
  keytype = "ENSEMBL",
  column = "SYMBOL",
  multiVals = "first" # Handle one-to-many mappings
)
'select()' returned 1:many mapping between keys and columns
Code
# Handle NAs (If a symbol isn't found, keep the Ensembl ID)
core_gene_symbols <- ifelse(
    is.na(core_gene_map), 
    names(core_gene_map), # Use the Ensembl ID as a fallback
    core_gene_map          # Otherwise, use the mapped symbol
    )

# Create a tibble for easier viewing and sorting
# (Using tibble for theme consistency)
core_gene_df <- tibble(
  Ensembl_ID = names(core_gene_map), 
  Gene_Symbol = core_gene_symbols
) %>% 
  arrange(Gene_Symbol) # Sort alphabetically by symbol

# Print the resulting tibble
print(core_gene_df, n = 20) # Print top 20
# A tibble: 957 × 2
   Ensembl_ID      Gene_Symbol
   <chr>           <chr>      
 1 ENSG00000198691 ABCA4      
 2 ENSG00000085563 ABCB1      
 3 ENSG00000005471 ABCB4      
 4 ENSG00000143921 ABCG8      
 5 ENSG00000213088 ACKR1      
 6 ENSG00000102575 ACP5       
 7 ENSG00000154930 ACSS1      
 8 ENSG00000159251 ACTC1      
 9 ENSG00000077522 ACTN2      
10 ENSG00000106526 ACTR3C     
11 ENSG00000123612 ACVR1C     
12 ENSG00000140470 ADAMTS17   
13 ENSG00000140873 ADAMTS18   
14 ENSG00000156218 ADAMTSL3   
15 ENSG00000185736 ADARB2     
16 ENSG00000143199 ADCY10     
17 ENSG00000141433 ADCYAP1    
18 ENSG00000112414 ADGRG6     
19 ENSG00000156110 ADK        
20 ENSG00000170425 ADORA2B    
# ℹ 937 more rows
Code
overlap_df <- data.frame(
  Ensembl_ID = names(core_gene_map), 
  Gene_Symbol = core_gene_symbols
) %>% 
  arrange(Gene_Symbol) # Sort alphabetically by symbol


# You can also print just the list of symbols:
# print(core_gene_df$Gene_Symbol, n = 1000) 

# You might want to save this list to a file for later reference:
#write.csv(core_gene_df, "core_signature_genes.csv", row.names = FALSE)

# You might want to save this list to a file for later reference:
#write.table(overlap_df$Gene_Symbol, "overlapping genes.txt", sep = " ", row.names = FALSE)

4.3 : GO Enrichment (Biological Process, Molecular Function and Cellular Compartment)

Code
# 4.3 : GO Enrichment (ORA) on Core Signature Genes ###

# Get Gene Lists (Entrez IDs)

# We will use 'core_sig_gene_ids' (Ensembl) from section 4.1
# We will use 'iso_all_entrez_ids' (Entrez) from section 2.4 as our universe

# Convert the Core Signature Ensembl IDs to Entrez IDs
core_sig_entrez_ids <- mapIds(
  org.Hs.eg.db,
  keys = core_sig_gene_ids, # From 5.1
  keytype = "ENSEMBL",
  column = "ENTREZID",
  multiVals = "first"
)
'select()' returned 1:many mapping between keys and columns
Code
# Run GO Enrichment (BP)
core_go_res_bp <- enrichGO(
  gene = core_sig_entrez_ids,
  universe = iso_all_entrez_ids, # Re-using from 2.4
  OrgDb = org.Hs.eg.db,
  ont = "BP",           
  pAdjustMethod = "BH",
  pvalueCutoff = fdr_cutoff,
  qvalueCutoff = qval_cutoff,
  readable = TRUE       
)

# Plot the Results (BP)
plot_core_go_res_bp <- print(
  dotplot(core_go_res_bp, showCategory = 15) + 
    labs(title = "GO Enrichment (ORA - Core Signature): Top 15 BP") +
    theme_minimal()
)

Code
# Run GO Enrichment (MF)
core_go_res_mf <- enrichGO(
  gene = core_sig_entrez_ids,
  universe = iso_all_entrez_ids, 
  OrgDb = org.Hs.eg.db,
  ont = "MF",           
  pAdjustMethod = "BH",
  pvalueCutoff = fdr_cutoff,
  qvalueCutoff = qval_cutoff,
  readable = TRUE       
)

# Plot the Results (MF)
plot_core_go_res_mf <- print(
  dotplot(core_go_res_mf, showCategory = 15) + 
    labs(title = "GO Enrichment (ORA - Core Signature): Top 15 Molecular Functions") +
    theme_minimal()
)

Code
# Run GO Enrichment (CC)
core_go_res_cc <- enrichGO(
  gene = core_sig_entrez_ids,
  universe = iso_all_entrez_ids, 
  OrgDb = org.Hs.eg.db,
  ont = "CC",           
  pAdjustMethod = "BH",
  pvalueCutoff = fdr_cutoff,
  qvalueCutoff = qval_cutoff,
  readable = TRUE       
)

# Plot the Results (CC)
plot_core_go_res_cc <- print(
  dotplot(core_go_res_cc, showCategory = 15) + 
    labs(title = "GO Enrichment (ORA - Core Signature): Top 15 Cellular Compartments") +
    theme_minimal()
)

4.4 : KEGG Analysis

Code
# 4.4 : KEGG Enrichment (ORA) on Core Signature Genes

# We will re-use the same Entrez ID lists from section 5.3:
# 'core_sig_entrez_ids' (the core significant genes)
# 'iso_all_entrez_ids' (the background universe from 2.4)

# Run KEGG Enrichment (ORA)
# Note: Requires an internet connection.
core_kegg_res <- enrichKEGG(
  gene = core_sig_entrez_ids,
  universe = iso_all_entrez_ids,
  organism = 'hsa', # 'hsa' is Homo sapiens
  pAdjustMethod = "BH",
  pvalueCutoff = fdr_cutoff,
  qvalueCutoff = qval_cutoff
)
Reading KEGG annotation online: "https://rest.kegg.jp/link/hsa/pathway"...
Reading KEGG annotation online: "https://rest.kegg.jp/list/pathway/hsa"...
Code
# Convert KEGG IDs back to Gene Symbols (for readability)
if (!is.null(core_kegg_res)) {
  core_kegg_res <- setReadable(core_kegg_res, OrgDb = org.Hs.eg.db, keyType="ENTREZID")
} else {
  print("No significant KEGG pathways found.")
}

# CHECK THE RESULTS
print("--- Top KEGG Pathway Results (ORA - Core Signature) ---")
[1] "--- Top KEGG Pathway Results (ORA - Core Signature) ---"
Code
if (!is.null(core_kegg_res)) {
  print(head(as.data.frame(core_kegg_res), n = 15)) # Show top 15
} 
                                     category
hsa04080 Environmental Information Processing
hsa04082                                 <NA>
hsa04512 Environmental Information Processing
                                 subcategory       ID
hsa04080 Signaling molecules and interaction hsa04080
hsa04082                                <NA> hsa04082
hsa04512 Signaling molecules and interaction hsa04512
                                     Description GeneRatio  BgRatio RichFactor
hsa04080 Neuroactive ligand-receptor interaction    37/330 310/7188  0.1193548
hsa04082            Neuroactive ligand signaling    23/330 189/7188  0.1216931
hsa04512                ECM-receptor interaction    12/330  85/7188  0.1411765
         FoldEnrichment   zScore       pvalue     p.adjust       qvalue
hsa04080       2.599765 6.315942 6.236878e-08 0.0000177751 1.746326e-05
hsa04082       2.650697 5.044434 1.604211e-05 0.0022860009 2.245896e-03
hsa04512       3.075080 4.221390 4.592110e-04 0.0436250434 4.285969e-02
                                                                                                                                                                                                                           geneID
hsa04080 POMC/TACR1/CALCRL/CHRNG/GRM7/OXTR/HRH1/CCK/SST/CHRNA9/GLRA1/HTR1E/GABRR1/GRM1/GRM3/TAC1/GRM8/GLRA2/CHRNB3/CALCB/DRD2/NPFFR1/ADRA2A/PMCH/CYSLTR2/SSTR1/CHRNA5/NPW/ADORA2B/ADCYAP1/GRP/MC4R/GALR1/NPBWR2/KISS1R/TSPO/GRIK1
hsa04082                                                                            POMC/TACR1/GRM7/HRH1/GLRA1/HTR1E/GABRR1/GRM1/GRM3/TAC1/GRM8/SLC18A1/SLC17A6/DRD2/HTR3A/CHAT/SLC18A3/ADRA2A/SLC18A2/ADORA2B/MC4R/SLC32A1/GRIK1
hsa04512                                                                                                                                                        TNR/ITGA4/FRAS1/SV2C/THBS4/COL4A6/FREM2/COL4A2/SV2B/GP1BA/VTN/GP6
         Count
hsa04080    37
hsa04082    23
hsa04512    12
Code
# Plot the Results (if any were found)
if (!is.null(core_kegg_res) && nrow(as.data.frame(core_kegg_res)) > 0) {
  plot_core_kegg_res <- print(
    dotplot(core_kegg_res, showCategory = 15) + # Changed to 15
      labs(title = "KEGG Enrichment (ORA - Core Signature): Top 15 Pathways") + # Changed to 15
      theme_minimal()
  )
} else {
    print("No significant KEGG pathways to plot.")
}

4.5 : Heatmap of Core Signature Genes

Code
# 4.5 : Heatmap of Core Signature Genes


# We will use 'core_sig_gene_ids' (Ensembl, clean) from section 4.1
# We will use 'res_isogenic_df' (from 2.2) to get significance ranks
# We will use 'vsd_counts_all_samples' (the full 15-sample count matrix) from 2.1

res_isogenic_df <- as.data.frame(res_isogenic) %>%
  rownames_to_column("ensembl_id_version") # Keep original IDs with version

# Clean IDs for mapping (remove version)
res_isogenic_df$ensembl_id_clean <- gsub("\\..*$", "", res_isogenic_df$ensembl_id_version)

res_isogenic_df$symbol <- gene_map_symbols[res_isogenic_df$ensembl_id_clean]


res_isogenic_df$symbol <- ifelse(
    is.na(res_isogenic_df$symbol), 
    res_isogenic_df$ensembl_id_version, # Fallback to original ID
    res_isogenic_df$symbol
    )

res_isogenic_df <- res_isogenic_df %>%
  mutate(
    significant = case_when(
      padj < fdr_cutoff & abs(log2FoldChange) > log2fc_cutoff ~ "Significant",
      TRUE ~ "Not Significant" # 'TRUE' means "everything else"
    )
  )

# Get the VERSIONED Ensembl IDs for the TOP 50 core genes
top_50_core_versioned_ids <- res_isogenic_df %>%
  # First, keep only the genes from our core signature
  filter(ensembl_id_clean %in% core_sig_gene_ids) %>%
  # Next, sort them by significance (most significant first)
  arrange(padj) %>%
  # Now, take only the top 50
  head(50) %>%
  # Finally, pull their versioned IDs for the count matrix
  pull(ensembl_id_version)

# Get Normalized Counts for these 50 genes across ALL 15 samples
top_50_core_counts <- vsd_counts_all_samples[top_50_core_versioned_ids, ]

# Robust Gene Symbol Conversion
gene_map_top_50_core <- mapIds(
  org.Hs.eg.db,
  keys = rownames(top_50_core_counts),
  keytype = "ENSEMBL",
  column = "SYMBOL"
)
'select()' returned 1:many mapping between keys and columns
Code
gene_symbols_top_50_core <- ifelse(is.na(gene_map_top_50_core), names(gene_map_top_50_core), gene_map_top_50_core)
rownames(top_50_core_counts) <- make.names(gene_symbols_top_50_core, unique = TRUE)

# Create Column Annotations (for all 15 samples)
# We assume 'colData' is still available
annotation_col_all <- data.frame(
  Status = colData$status # Use the 'status' column (Patient/Control)
)
rownames(annotation_col_all) <- colnames(top_50_core_counts) 

# Generate the Heatmap
pheatmap(
  top_50_core_counts,
  scale = "row",        # Scale genes to see relative changes (Z-score)
  cluster_rows = TRUE,
  cluster_cols = TRUE,
  show_rownames = TRUE, # <-- CHANGED (We can now show the names)
  show_colnames = TRUE, 
  main = "Heatmap of Top 50 Core Signature Genes (All Samples)", # <-- CHANGED
  annotation_col = annotation_col_all,
  border_color = NA,
  fontsize_row = 8      # <-- RE-ENABLED
)

PHASE 5 : Transcription Factor Analysis

5.1 : Upstream Transcription Factor (TF) Enrichment

Code
# 5.1 : Upstream Transcription Factor (TF) Enrichment

# Load Libraries
library(decoupleR)
library(gt)        # For aesthetic tables
library(ggrepel)   # For non-overlapping plot labels

# --- 2. Get the Regulon (TF-target list) ---
# We use DoRothEA levels A, B, C for a high-confidence regulon
iso_regulon_dorothea <- get_dorothea(
  organism = "human",
  levels = c('A', 'B', 'C') 
)
Warning in system2("quarto", "-V", stdout = TRUE, env = paste0("TMPDIR=", :
running command '"quarto"
TMPDIR=C:/Users/RAVAL/AppData/Local/Temp/RtmpeWuu9C/file6fa4728b1539 -V' had
status 1
Code
# Prepare our Gene List (as a Matrix)
# We re-use 'res_isogenic_df' from section 2.2
iso_tf_input_data <- res_isogenic_df %>%
  # Filter out genes that have no stat or no symbol
  filter(!is.na(stat) & !is.na(symbol) & !grepl("^ENSG", symbol)) %>% 
  dplyr::select(symbol, stat) %>%
  distinct(symbol, .keep_all = TRUE) # Keep one stat per gene symbol

# Convert to a named vector
iso_tf_stat_vector <- iso_tf_input_data$stat
names(iso_tf_stat_vector) <- iso_tf_input_data$symbol

# Coerce to a 1-column matrix (required for run_mlm)
iso_tf_stat_matrix <- as.matrix(iso_tf_stat_vector)
colnames(iso_tf_stat_matrix) <- "isogenic_comparison"

# Run the TF Enrichment (using 'mlm' method)
iso_tf_activities <- run_mlm(
  mat = iso_tf_stat_matrix, 
  network = iso_regulon_dorothea,
  .source = "source",  # TF column in regulon
  .target = "target",  # Target gene column in regulon
  .mor = "mor",        # Mode of regulation (score)
  minsize = 5          # Min 5 target genes per TF
)

# Process and Tidy Results
iso_tf_results <- iso_tf_activities %>%
  # 'source' is the TF, 'condition' is our matrix column name
  dplyr::select(TF = source, Activity_Score = score, P_Value = p_value) %>%
  # Create a BH-adjusted p-value column
  mutate(p_adj = p.adjust(P_Value, method = "BH")) %>%
  arrange(p_adj)

# Display Top TFs (gt Table)
iso_tf_results %>%
  head(20) %>%
  gt() %>%
  tab_header(
    title = "Top 20 Predicted TFs (Isogenic Analysis)",
    subtitle = "Based on multi-linear model (mlm) enrichment"
  ) %>%
  cols_label(
    TF = "Transcription Factor",
    Activity_Score = "Activity Score",
    P_Value = "P-Value",
    p_adj = "Adjusted P-Value"
  ) %>%
  fmt_scientific(
    columns = c(P_Value, p_adj),
    decimals = 2
  ) %>%
  fmt_number(
    columns = Activity_Score,
    decimals = 3
  ) %>%
  data_color(
    columns = Activity_Score,
    palette = c("#3C5488", "white", "#E64B35") # Blue -> White -> Red
  ) %>%
  tab_options(
    table.border.top.color = "black",
    heading.title.font.weight = "bold",
    column_labels.font.weight = "bold"
  )
Top 20 Predicted TFs (Isogenic Analysis)
Based on multi-linear model (mlm) enrichment
Transcription Factor Activity Score P-Value Adjusted P-Value
EGR1 −6.359 2.07 × 10−10 6.12 × 10−8
ATF4 5.580 2.43 × 10−8 3.60 × 10−6
NR2C2 5.428 5.76 × 10−8 5.68 × 10−6
PRDM14 −4.754 2.01 × 10−6 1.48 × 10−4
SOX2 −4.384 1.17 × 10−5 6.94 × 10−4
TFAP2C −3.546 3.92 × 10−4 1.93 × 10−2
MYCN 3.445 5.72 × 10−4 2.42 × 10−2
MITF 3.299 9.70 × 10−4 3.59 × 10−2
SRF −3.130 1.75 × 10−3 5.34 × 10−2
PBX3 −3.120 1.81 × 10−3 5.34 × 10−2
THAP11 3.093 1.98 × 10−3 5.34 × 10−2
IRF3 −3.031 2.44 × 10−3 5.45 × 10−2
ZNF384 3.014 2.58 × 10−3 5.45 × 10−2
ZNF263 −3.015 2.58 × 10−3 5.45 × 10−2
MEF2C −2.825 4.73 × 10−3 8.85 × 10−2
NR2F2 −2.821 4.79 × 10−3 8.85 × 10−2
GATA6 −2.743 6.09 × 10−3 1.06 × 10−1
ETS1 2.617 8.89 × 10−3 1.38 × 10−1
FOXO3 −2.621 8.76 × 10−3 1.38 × 10−1
STAT2 −2.535 1.12 × 10−2 1.66 × 10−1
Code
# --- 7. Visualize Top TFs (Bar Plot) ---
iso_tf_barplot <- iso_tf_results %>%
  head(20) %>%
  mutate(TF = reorder(TF, Activity_Score)) %>%
  ggplot(aes(x = TF, y = Activity_Score, fill = Activity_Score > 0)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_fill_manual(
    values = c("TRUE" = "#E64B35", "FALSE" = "#3C5488"),
    labels = c("Repressed", "Activated"),
    name = "TF Activity"
  ) +
  labs(
    title = "Top 20 Enriched Transcription Factors (Analysis 1)",
    subtitle = "Based on Isogenic (Patient1 vs. CRISPR_P1) DEGs",
    x = "Transcription Factor",
    y = "Activity Score (MLM)"
  ) +
  theme_minimal(base_size = 12) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "black")

print(iso_tf_barplot)

Code
# --- 8. Visualize All TFs (Volcano Plot) ---
iso_tf_volcano <- iso_tf_results %>%
  mutate(
    neg_log_p_adj = -log10(p_adj),
    significant = p_adj < fdr_cutoff & abs(Activity_Score) > 0.5 
  ) %>%
  ggplot(aes(x = Activity_Score, y = neg_log_p_adj, color = significant, label = TF)) +
  geom_point(size = 2, alpha = 0.7) +
  scale_color_manual(
    values = c("FALSE" = "gray70", "TRUE" = "#E64B35"),
    name = "Significant"
  ) +
  geom_hline(yintercept = -log10(fdr_cutoff), linetype = "dashed", color = "black") +
  geom_vline(xintercept = c(-0.5, 0.5), linetype = "dashed", color = "black") +
  ggrepel::geom_text_repel(
    data = . %>% filter(p_adj < 0.001 & abs(Activity_Score) > 1), # Label top hits
    size = 3.5,
    max.overlaps = 15
  ) +
  labs(
    title = "TF Activity Landscape (Analysis 1)",
    subtitle = "Significance vs. Effect Size",
    x = "TF Activity Score (MLM)",
    y = "-log10(Adjusted P-value)"
  ) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "bottom")

print(iso_tf_volcano)

5.2 Analysis of Key TFs

Code
# 5.2 : Analysis of Key TFs

# Check Key Neurodevelopmental TFs
neuro_tfs_to_check <- c(
  "FOXA2", "LMX1A", "LMX1B", "NR4A2", "PITX3", "EN1", "EN2", 
  "SOX2", "PAX6", "NEUROD1", "TBR1", "REST", "FOXO1", "CREB1", "MEF2C",
  "EGR1", "CTCF", "ATF4" # Adding TFs from your report
)

iso_neuro_tf_results <- iso_tf_results %>%
  filter(TF %in% neuro_tfs_to_check) %>%
  arrange(p_adj) %>%
  mutate(
    Direction = ifelse(Activity_Score > 0, "Activated", "Repressed"),
    Significant = ifelse(p_adj < fdr_cutoff, "Yes", "No")
  )

# Display with gt
iso_neuro_tf_results %>%
  gt() %>%
  tab_header(title = "Activity of Key Neurodevelopmental TFs") %>%
  fmt_scientific(columns = P_Value, decimals = 2) %>%
  fmt_scientific(columns = p_adj, decimals = 2) %>%
  fmt_number(columns = Activity_Score, decimals = 3) %>%
  data_color(
    columns = Activity_Score,
    palette = c("#3C5488", "white", "#E64B35")
  ) %>%
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_body(
      columns = Significant,
      rows = Significant == "Yes"
    )
  )
Activity of Key Neurodevelopmental TFs
TF Activity_Score P_Value p_adj Direction Significant
EGR1 −6.359 2.07 × 10−10 6.12 × 10−8 Repressed Yes
ATF4 5.580 2.43 × 10−8 3.60 × 10−6 Activated Yes
SOX2 −4.384 1.17 × 10−5 6.94 × 10−4 Repressed Yes
MEF2C −2.825 4.73 × 10−3 8.85 × 10−2 Repressed No
PAX6 −1.970 4.89 × 10−2 3.08 × 10−1 Repressed No
FOXO1 −0.738 4.61 × 10−1 8.63 × 10−1 Repressed No
NEUROD1 −0.642 5.21 × 10−1 9.04 × 10−1 Repressed No
CTCF −0.543 5.87 × 10−1 9.20 × 10−1 Repressed No
CREB1 0.509 6.11 × 10−1 9.21 × 10−1 Activated No
REST −0.408 6.83 × 10−1 9.36 × 10−1 Repressed No
FOXA2 0.385 7.00 × 10−1 9.46 × 10−1 Activated No
Code
# Check if Significantly Active TFs are also DEGs
iso_tf_significant_names <- iso_tf_results %>%
  filter(p_adj < fdr_cutoff) %>%
  pull(TF)

iso_tf_expression <- res_isogenic_df %>%
  filter(symbol %in% iso_tf_significant_names) %>%
  dplyr::select(TF = symbol, log2FoldChange, padj) %>%
  mutate(
    Status = ifelse(padj < fdr_cutoff, "Significant DE", "Not DE")
  ) %>%
  arrange(padj)

# Display with gt
iso_tf_expression %>%
  gt() %>%
  tab_header(title = "Expression Status of Significantly Active TFs") %>%
  fmt_number(columns = log2FoldChange, decimals = 2) %>%
  fmt_scientific(columns = padj, decimals = 2) %>%
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_body(
      columns = Status,
      rows = Status == "Significant DE"
    )
  ) %>%
  tab_footnote(
    footnote = "Note: Activity can change (e.g., via phosphorylation) even if the TF's own gene expression does not.",
    locations = cells_title()
  )
Expression Status of Significantly Active TFs1
TF log2FoldChange padj Status
MYCN −0.66 1.29 × 10−3 Significant DE
SOX2 −0.40 2.62 × 10−3 Significant DE
EGR1 −1.87 2.45 × 10−2 Significant DE
ATF4 0.28 1.11 × 10−1 Not DE
NR2C2 0.08 7.26 × 10−1 Not DE
MITF −0.14 7.40 × 10−1 Not DE
TFAP2C −0.06 9.67 × 10−1 Not DE
1 Note: Activity can change (e.g., via phosphorylation) even if the TF's own gene expression does not.
Code
# 3. Export Results to File
write.csv(
  iso_tf_results %>% filter(p_adj < fdr_cutoff),
  "TF_enrichment_significant_results.csv",
  row.names = FALSE
)

PHASE 6 : STRING DB

https://version-12-0.string-db.org/cgi/network?networkId=bAVOdXXjJ2xe

6.1 : STRING DB Network

6.2 : Top enrichment processes

PHASE 7 : Identifying potential lncRNA and Non-coding gene involvement

7.1 : Novel lncRNA & Non-Coding Gene Discovery

Code
# 7.1 : Novel Non-Coding Gene Discovery

# Load Libraries
library(biomaRt)
library(DT) # For interactive tables

# Get All Significant Gene IDs (from Isogenic Analysis)
# We re-use 'res_isogenic_df' from section 2.2
iso_sig_genes_df <- res_isogenic_df %>%
  filter(significant == "Significant") # Use the column we already made

iso_sig_ensembl_cleaned <- iso_sig_genes_df$ensembl_id_clean

# Find "Unmappable" IDs (the non-coding candidates)
# Find genes that do NOT map to an Entrez ID (which org.Hs.eg.db uses)
entrez_map <- mapIds(org.Hs.eg.db,
                     keys = iso_sig_ensembl_cleaned,
                     keytype = "ENSEMBL",
                     column = "ENTREZID",
                     multiVals = "first")
'select()' returned 1:many mapping between keys and columns
Code
iso_unmappable_ids <- names(entrez_map[is.na(entrez_map)])

print(paste("Found", length(iso_unmappable_ids), "significant genes not in Entrez/org.Hs.eg.db (non-coding candidates)"))
[1] "Found 357 significant genes not in Entrez/org.Hs.eg.db (non-coding candidates)"
Code
# Annotate Unknown IDs with biomaRt
if (length(iso_unmappable_ids) > 0) {
  ensembl_mart <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
  
  iso_gene_annotations <- getBM(
    attributes = c('ensembl_gene_id', 'external_gene_name', 'gene_biotype', 'description'),
    filters = 'ensembl_gene_id',
    values = iso_unmappable_ids, 
    mart = ensembl_mart
  )

  # Join with expression data
  iso_novel_hits_df <- iso_gene_annotations %>%
    # Join annotations with our DESeq2 results
    left_join(
      res_isogenic_df %>% dplyr::select(ensembl_id_clean, log2FoldChange, padj, baseMean), 
      by = c("ensembl_gene_id" = "ensembl_id_clean")
    ) %>%
    distinct(ensembl_gene_id, .keep_all = TRUE) %>%
    arrange(padj) # Sort by most significant

  # --- 6. Display All Novel Hits (Interactive) ---
  print("--- Discovered Significant Non-Coding & Novel Genes ---")
  print(
    datatable(iso_novel_hits_df, 
              caption = "Top Dysregulated Non-Coding & Unmapped Genes",
              options = list(pageLength = 10, scrollX = TRUE),
              rownames = FALSE,
              filter = 'top'
              )
  )
  
  # Save the list
  write.csv(iso_novel_hits_df, "novel_significant_genes.csv", row.names = FALSE)

  # Filter for and Display lncRNAs
  iso_lncrna_hits_df <- iso_novel_hits_df %>%
    filter(gene_biotype == "lncRNA")
    
  print("--- Of these, the following are lncRNAs: ---")
  print(
    datatable(iso_lncrna_hits_df, 
              caption = "Top Dysregulated lncRNAs",
              options = list(pageLength = 10, scrollX = TRUE),
              rownames = FALSE,
              filter = 'top'
              )
  )
  
  write.csv(iso_lncrna_hits_df, "novel_significant_lncRNAs.csv", row.names = FALSE)
  
} else {
  print("No unmapped significant genes were found.")
}
[1] "--- Discovered Significant Non-Coding & Novel Genes ---"
[1] "--- Of these, the following are lncRNAs: ---"

7.2 : Co-expression Heatmap

Code
# 7.2 : Co-expression Heatmap (lncRNA vs. Key Genes) 

# Load Libraries
library(pheatmap)

# Define Target Gene Lists
# We re-use 'iso_lncrna_hits_df' from 7.1
iso_top_lncrnas <- iso_lncrna_hits_df %>%
  head(5) %>% # Take top 5 most significant
  pull(external_gene_name) # Get their gene symbols

# Key PD & Neurodevelopmental genes to check against
iso_key_dev_genes <- c(
  "DNAJC6", "SNCA", "CLTC",  # Key PD/CME genes
  "DLX1", "DLX2", "DLX5", "DLX6", # DLX family (partner for DLX6-AS1)
  "FOXA2", "NEUROD1", "LMX1A", "PAX6", # Top neurodev TFs from ORA
  "EGR1", "ATF4", # Top TFs from decoupleR
  "DNAJC6", # Your causal gene
  "SNCA",   # Alpha-synuclein
  "CLTC",   # Clathrin Heavy Chain
  "SYN1"    # Synapsin I
)

iso_all_heatmap_symbols <- unique(c(iso_top_lncrnas, iso_key_dev_genes))

# Get Normalized Counts for Target Genes
# We re-use 'vsd' (from 2.1) and 'res_isogenic_df' (from 2.2)
if (!exists("vsd")) {
  stop("Error: 'vsd' object not found. Please run the PCA chunk (2.1) first.")
}

# Find the Ensembl IDs (with version) that match our symbols
iso_target_ensembl_ids <- res_isogenic_df %>%
  filter(symbol %in% iso_all_heatmap_symbols) %>%
  distinct(symbol, .keep_all = TRUE) %>% # Handle duplicate symbols
  pull(ensembl_id_version)
  
# Get normalized counts for these genes across all 15 samples
# and transpose it (so genes are columns, samples are rows)
iso_cor_matrix_data <- t(assay(vsd)[iso_target_ensembl_ids, ])

# Set Column Names to Gene Symbols
# We need to map the versioned IDs back to symbols
symbol_map_df <- res_isogenic_df %>%
  filter(ensembl_id_version %in% iso_target_ensembl_ids) %>%
  dplyr::select(ensembl_id_version, symbol)

# Match the column names (versioned IDs) to their symbols
colnames(iso_cor_matrix_data) <- symbol_map_df$symbol[
  match(colnames(iso_cor_matrix_data), symbol_map_df$ensembl_id_version)
]

# Calculate Correlation Matrix
iso_cor_matrix <- cor(iso_cor_matrix_data, method = "pearson", use = "pairwise.complete.obs")

# Plot Correlation Heatmap
print(
  pheatmap(
    iso_cor_matrix,
    main = "Co-expression: Top lncRNAs vs. Key PD & Development Genes",
    fontsize_row = 10,
    fontsize_col = 10,
    color = colorRampPalette(c("#3C5488", "white", "#E64B35"))(50), # Blue-White-Red
    cluster_rows = TRUE,
    cluster_cols = TRUE,
    display_numbers = TRUE, # Show correlation values
    number_format = "%.2f", # Format to 2 decimal places
    fontsize_number = 8,
    border_color = "grey60"
  )
)

Code
### Session Information ###
sessionInfo()
R version 4.5.1 (2025-06-13 ucrt)
Platform: x86_64-w64-mingw32/x64
Running under: Windows 11 x64 (build 26200)

Matrix products: default
  LAPACK version 3.12.1

locale:
[1] LC_COLLATE=English_India.utf8  LC_CTYPE=English_India.utf8   
[3] LC_MONETARY=English_India.utf8 LC_NUMERIC=C                  
[5] LC_TIME=English_India.utf8    

time zone: Asia/Calcutta
tzcode source: internal

attached base packages:
[1] stats4    stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] DT_0.34.0                   biomaRt_2.64.0             
 [3] gt_1.1.0                    decoupleR_2.9.7            
 [5] ggVennDiagram_1.5.4         RColorBrewer_1.1-3         
 [7] org.Hs.eg.db_3.21.0         AnnotationDbi_1.70.0       
 [9] EnhancedVolcano_1.26.0      ggrepel_0.9.6              
[11] enrichplot_1.28.4           clusterProfiler_4.16.0     
[13] pheatmap_1.0.13             edgeR_4.6.3                
[15] limma_3.64.3                DESeq2_1.48.1              
[17] SummarizedExperiment_1.38.1 Biobase_2.68.0             
[19] MatrixGenerics_1.20.0       matrixStats_1.5.0          
[21] GenomicRanges_1.60.0        GenomeInfoDb_1.44.3        
[23] IRanges_2.42.0              S4Vectors_0.46.0           
[25] BiocGenerics_0.54.1         generics_0.1.4             
[27] lubridate_1.9.4             forcats_1.0.1              
[29] stringr_1.5.2               dplyr_1.1.4                
[31] purrr_1.1.0                 readr_2.1.5                
[33] tidyr_1.3.1                 tibble_3.3.0               
[35] ggplot2_4.0.0               tidyverse_2.0.0            
[37] knitr_1.50                  htmltools_0.5.8.1          
[39] r3dmol_0.1.2               

loaded via a namespace (and not attached):
  [1] splines_4.5.1           later_1.4.4             filelock_1.0.3         
  [4] ggplotify_0.1.3         cellranger_1.1.0        R.oo_1.27.1            
  [7] XML_3.99-0.19           lifecycle_1.0.4         httr2_1.2.1            
 [10] tcltk_4.5.1             sf_1.0-21               vroom_1.6.6            
 [13] processx_3.8.6          lattice_0.22-7          crosstalk_1.2.2        
 [16] backports_1.5.0         magrittr_2.0.4          sass_0.4.10            
 [19] rmarkdown_2.30          jquerylib_0.1.4         yaml_2.3.10            
 [22] httpuv_1.6.16           otel_0.2.0              ggtangle_0.0.7         
 [25] zip_2.3.3               sessioninfo_1.2.3       chromote_0.5.1         
 [28] cowplot_1.2.0           DBI_1.2.3               abind_1.4-8            
 [31] rvest_1.0.5             R.utils_2.13.0          yulab.utils_0.2.1      
 [34] rappdirs_0.3.3          GenomeInfoDbData_1.2.14 tidytree_0.4.6         
 [37] units_1.0-0             parallelly_1.45.1       codetools_0.2-20       
 [40] DelayedArray_0.34.1     DOSE_4.2.0              xml2_1.4.1             
 [43] tidyselect_1.2.1        aplot_0.2.9             UCSC.utils_1.4.0       
 [46] farver_2.1.2            BiocFileCache_2.16.2    jsonlite_2.0.0         
 [49] e1071_1.7-16            tools_4.5.1             progress_1.2.3         
 [52] treeio_1.32.0           snow_0.4-4              Rcpp_1.1.0             
 [55] glue_1.8.0              SparseArray_1.8.1       xfun_0.53              
 [58] qvalue_2.40.0           websocket_1.4.4         withr_3.0.2            
 [61] fastmap_1.2.0           digest_0.6.37           timechange_0.3.0       
 [64] R6_2.6.1                mime_0.13               gridGraphics_0.5-1     
 [67] colorspace_2.1-2        GO.db_3.21.0            RSQLite_2.4.3          
 [70] R.methodsS3_1.8.2       utf8_1.2.6              data.table_1.17.8      
 [73] class_7.3-23            prettyunits_1.2.0       httr_1.4.7             
 [76] htmlwidgets_1.6.4       S4Arrays_1.8.1          pkgconfig_2.0.3        
 [79] gtable_0.3.6            blob_1.2.4              S7_0.2.0               
 [82] selectr_0.4-2           XVector_0.48.0          OmnipathR_3.17.9       
 [85] fgsea_1.34.2            scales_1.4.0            png_0.1-8              
 [88] ggfun_0.2.0             rstudioapi_0.17.1       tzdb_0.5.0             
 [91] reshape2_1.4.4          checkmate_2.3.3         nlme_3.1-168           
 [94] curl_7.0.0              proxy_0.4-27            cachem_1.1.0           
 [97] KernSmooth_2.23-26      parallel_4.5.1          pillar_1.11.1          
[100] grid_4.5.1              logger_0.4.1            vctrs_0.6.5            
[103] promises_1.4.0          dbplyr_2.5.1            xtable_1.8-4           
[106] evaluate_1.0.5          cli_3.6.5               locfit_1.5-9.12        
[109] compiler_4.5.1          rlang_1.1.6             crayon_1.5.3           
[112] labeling_0.4.3          classInt_0.4-11         ps_1.9.1               
[115] plyr_1.8.9              fs_1.6.6                stringi_1.8.7          
[118] BiocParallel_1.42.1     Biostrings_2.76.0       lazyeval_0.2.2         
[121] GOSemSim_2.34.0         Matrix_1.7-4            hms_1.1.4              
[124] patchwork_1.3.2         bit64_4.6.0-1           KEGGREST_1.48.1        
[127] statmod_1.5.1           shiny_1.11.1            igraph_2.2.1           
[130] memoise_2.0.1           bslib_0.9.0             ggtree_3.16.3          
[133] fastmatch_1.1-6         bit_4.6.0               readxl_1.4.5           
[136] ape_5.8-1               gson_0.1.0